## Colour palettes for the plots in this document.
## Seven RColorBrewer palettes are combined into a single colour vector, then
## two wesanderson palettes are appended; each piece is wrapped in list() so
## that color_list ends up as a three-element list.
fabcolors <- RColorBrewer::brewer.pal(n = 11, name = "RdGy")
col1 <- RColorBrewer::brewer.pal(n = 10, name = "PRGn")
col2 <- RColorBrewer::brewer.pal(n = 10, name = "Spectral")
col3 <- RColorBrewer::brewer.pal(n = 10, name = "BrBG")
col4 <- RColorBrewer::brewer.pal(n = 10, name = "PiYG")
col5 <- RColorBrewer::brewer.pal(n = 10, name = "PuOr")
col6 <- RColorBrewer::brewer.pal(n = 10, name = "RdBu")
allcolors <- list(c(fabcolors, col1, col2, col3, col4, col5, col6))
morecolors1 <- list(wes_palette("Darjeeling1", n = 4, type = "discrete"))
morecolors2 <- list(wes_palette("Moonrise2", n = 3, type = "discrete"))
color_list <- c(allcolors, morecolors1, morecolors2)
This document describes training a random forest model using sample-wise immune cell signatures generated by MCP-counter from RNA-Seq data as its input features. We chose to train a random forest classifier to identify NF tumor types using the MCP-counter signatures because of the following features of the model:
Our goal is to find important latent variables that classify the various tumor types. We will then inspect the classifying features to find meaningful genesets that distinguish between two tumor types.
Tumortypes represented in the data:
The dataset was split into 75% training and 25% testing dataset. The function createDataPartition is used to create balanced splits of the data. Since the tumorType argument to this function is a factor, the random sampling occurs within each class and should preserve the overall class distribution of the data.
# Stratified 75/25 train/test split of forest_data.
# createDataPartition() samples within each tumorType level, so the class
# distribution of the full dataset is preserved in both partitions.
# set.seed(998)  # uncomment to make the split (and downstream models) reproducible
inTraining <- createDataPartition(as.factor(forest_data$tumorType), p = 0.75, list = FALSE)
training <- forest_data[inTraining, ]
testing <- forest_data[-inTraining, ]
We first trained an initial model on the training dataset using a range of `mtry` values and 1000 trees. The model was cross-validated using repeated 5-fold cross-validation.
Below are details of our initial model.
## Load Fit data (the model described in this document is stored on Synapse)
#load(synGet("syn21201190")$path)
# Cross-validation control: 5-fold CV repeated 5 times
# (earlier comments said "10 fold" / "repeated ten times", which did not
# match the code below — corrected to reflect number = 5, repeats = 5)
fitControl <- trainControl(## 5-fold CV
method = "repeatedcv",
number = 5,
## repeated five times
repeats = 5)
# Candidate mtry values 1..10 for caret tuning; the leading dot in `.mtry`
# is caret's tuning-grid naming convention.
tunegrid <- expand.grid(.mtry=c(1:10))
#Find the classes:
print("Summary of classes in training data")
## [1] "Summary of classes in training data"
# Per-class sample counts in the training split (tumorType is a factor)
summary(training$tumorType)
## Cutaneous Neurofibroma
## 25
## High Grade Glioma
## 12
## Low Grade Glioma
## 27
## Malignant Peripheral Nerve Sheath Tumor
## 10
## Neurofibroma
## 9
## Plexiform Neurofibroma
## 15
## Construct the initial random forest model, Fit_init, using the repeated-CV
## control and the mtry tuning grid defined above. (A fitted copy of this model
## is stored on Synapse; load it instead of re-training for quick rendering.)
# set.seed(9998)
Fit_init <- train(tumorType ~ .,
                  # `training[, c(1:ncol(training))]` selected every column,
                  # i.e. the whole data frame — pass it directly.
                  data = training,
                  method = "rf",
                  ntree = 1000,
                  tuneGrid = tunegrid,   # .mtry = 1:10
                  proximity = TRUE,      # proximity matrix needed for MDSplot below
                  importance = TRUE,     # per-class importance needed for varImp
                  trControl = fitControl,
                  verbose = TRUE)
# Print the caret model summary (resampling results per mtry).
# Typo fixed in the status message: "theinitial" -> "the initial".
print("Check the fit of the initial model")
## [1] "Check the fit of the initial model"
Fit_init
## Random Forest
##
## 98 samples
## 10 predictors
## 6 classes: 'Cutaneous Neurofibroma', 'High Grade Glioma', 'Low Grade Glioma', 'Malignant Peripheral Nerve Sheath Tumor', 'Neurofibroma', 'Plexiform Neurofibroma'
##
## No pre-processing
## Resampling: Cross-Validated (5 fold, repeated 5 times)
## Summary of sample sizes: 77, 78, 79, 79, 79, 78, ...
## Resampling results across tuning parameters:
##
## mtry Accuracy Kappa
## 1 0.5875556 0.4720673
## 2 0.5897444 0.4801725
## 3 0.5900501 0.4807098
## 4 0.5905464 0.4815103
## 5 0.5841253 0.4739759
## 6 0.5884411 0.4795845
## 7 0.5882406 0.4792475
## 8 0.5861353 0.4771648
## 9 0.5864411 0.4779593
## 10 0.5843358 0.4753018
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 4.
# Plot cross-validated accuracy against mtry.
# Fixes: labs() has no `main` argument in ggplot2 (the title never rendered) —
# use `title`; and a constant point size belongs outside aes() (inside aes()
# it is treated as a mapping and creates a spurious legend).
theme_update(text = element_text(size = 20))
ggplot(Fit_init$results, aes(x = mtry, y = Accuracy)) +
  geom_point(size = 2) +
  geom_line() +
  coord_cartesian(ylim = c(0, 1)) +
  labs(title = "The model",
       x = "mtry :: Number of features for each split",
       y = "Accuracy of the model")
print("Check the clustering of the samples according to the model")
## [1] "Check the clustering of the samples according to the model"
# MDS plot of the random-forest proximity matrix (available because the model
# was trained with proximity=TRUE). k=2 plots the first two MDS dimensions;
# point shapes encode the true tumorType of each training sample.
MDSplot(Fit_init$finalModel,
as.factor(training$tumorType),
k=2, palette=NULL,
pch=as.numeric(training$tumorType),
cex=2,
cex.axis= 1.5,
cex.lab = 1.5,
cex.main = 1.5,
main= "MDS Plot of the classified training set")
# Legend lists the tumorType factor levels in order.
# NOTE(review): the legend fills come from Set1, but with palette=NULL
# MDSplot chooses its own point colours — verify the legend colours actually
# match the plotted classes.
legend("topright",
inset=0.01,
cex= 1.0,
legend=levels(training$tumorType),
fill=brewer.pal(6, "Set1"))
#Use model to predict labels of test data
# `testing[, c(1:length(colnames(testing)))]` selected every column, i.e. the
# whole data frame — pass it directly (predict ignores non-feature columns
# matched by the model formula anyway).
pred <- predict(Fit_init, newdata = testing)
#store predicted labels in the test dataframe
testing$pred <- pred
# Check the accuracy of the model: per-class precision/recall/F1
library(DT)
conf_matrix <- confusionMatrix(data = testing$pred,
                               reference = as.factor(testing$tumorType),
                               mode = "prec_recall")
#conf_matrix
## Collect the per-class F1 scores from the confusion matrix into a small
## data frame (Class, F1) used as the seed for the iteration results below.
perf <- as.data.frame(conf_matrix$byClass)
perf[["Class"]] <- rownames(perf)
perf <- dplyr::select(perf, Class, F1)
#conf_matrix$table
We observed that tuning our model did not significantly improve the performance of the initial model. As a result, we tried an ensemble approach, training 500 models on different train/test splits, and plotted the performance of all of them.
# Container for the per-iteration variable-importance tables
FeatureList <- list()
# Seed the results table with the single-model F1 scores; one column per
# iteration is appended by the (commented-out) loop below
perf_new <- perf
# Load the precomputed FeatureList and perf_new objects stored on Synapse
# (this overwrites the two objects initialised above with the 500-iteration results)
load(synGet("syn21246234")$path)
#The model building code has been commented out below for quick rendering of html
# for (i in 1:500){
# # make new train-test set
# inTraining <- createDataPartition(as.factor(forest_data$tumorType), p = .75, list = FALSE)
# training <- forest_data[ inTraining,]
# testing <- forest_data[-inTraining,]
#
# #make new model
# Fit_new <- train(tumorType ~ .,
# data = training[,c(1:ncol(training))],
# method = "rf",
# ntree= 1000,
# #tuneGrid = tunegrid,
# #classwt =
# proximity=TRUE,
# importance = TRUE,
# trControl = fitControl,
# verbose = TRUE)
#
# # predict test set with the model to get F1 scores
# pred <- predict(Fit_new, newdata = testing[,c(1:length(colnames(testing)))])
# #store predicted labels in the test dataframe
# testing$pred <- pred
#
# #Make confusion matrix
# conf_matrix <- confusionMatrix(data = testing$pred,
# reference = as.factor(testing$tumorType),
# mode = "prec_recall")
#
# ## Store F1 scores from various iterations of the forest
# df <- as.data.frame(conf_matrix$byClass)
# perf_new[, glue('Iter{i}')] <- df$F1
# #perf[, Class] <- rownames(df)
#
#
# #Store Feature importance for all iterations in a list
# # estimate variable importance
# importance <- varImp(Fit_new, scale=FALSE)
#
# # Select top important features
# features <- as.data.frame(importance$importance)
#
# FeatureList[[i]] <- (features)
# }
# Plot histogram of all F1 scores
# Reshape to long format: one row per (Class, iteration) F1 score.
perf_new$Class <- as.factor(perf_new$Class)
perf_new_long <- gather(perf_new, iteration, All_scores, F1:Iter500, factor_key=TRUE)
# NOTE: the previous `par(mfrow = c(2, 1))` call only affects base graphics
# and was a no-op for the grid-based ggplot output below, so it was removed.
theme_update(text = element_text(size = 10))
ggplot(perf_new_long, aes(x = All_scores, fill = Class, color = Class)) +
  geom_histogram(binwidth = 0.05, alpha = 0.5, position = "dodge") +
  theme(legend.position = "top") +
  scale_color_brewer(palette = "Spectral") +
  scale_fill_brewer(palette = "Spectral") +
  labs(title = "F1 scores for iterations of RF", x = "F1 score", y = "Counts") +
  theme(text = element_text(size = 10),
        axis.text.x = element_text(size = 10),
        axis.text.y = element_text(size = 10))
# Density version of the F1-score distributions.
# Fix: the y axis of geom_density() is a density estimate, not a count —
# label corrected from "Counts" to "Density".
ggplot(perf_new_long, aes(x = All_scores, fill = Class, color = Class)) +
  geom_density(alpha = .2) +
  theme(legend.position = "top") +
  scale_color_brewer(palette = "Spectral") +
  scale_fill_brewer(palette = "Spectral") +
  labs(title = "F1 scores for iterations of RF", x = "F1 score", y = "Density") +
  theme(text = element_text(size = 10),
        axis.text.x = element_text(size = 10),
        axis.text.y = element_text(size = 10))
We also saved the features deemed important in each iteration of the model for each of our classes. Below, we plot the importance scores of each feature across all 500 models.
# Plot a distribution of feature importance per class.
# Column 1 of each importance table corresponds to Cutaneous Neurofibroma; the
# rows are the MCP-counter features, so the `Class` column here actually holds
# feature names (NOTE(review): consider renaming it `Feature`).
features_cNF <- as.data.frame(sapply(FeatureList, `[[`, 1))
features_cNF$Class <- rownames(FeatureList[[1]])
cNF_long <- gather(features_cNF, iteration, All_scores, V1:V500, factor_key=TRUE)
theme_update(text = element_text(size = 10))
# Fix: y axis of geom_density() is a density, not a count.
ggplot(cNF_long, aes(x = All_scores, fill = Class, color = Class)) +
  geom_density(alpha = .2) +
  theme(legend.position = "top") +
  scale_color_brewer(palette = "Spectral") +
  scale_fill_brewer(palette = "Spectral") +
  labs(title = "Importance of features in cNF", x = "Importance", y = "Density")
# Column 2 of each importance table = High Grade Glioma; rows are the
# MCP-counter features (so `Class` holds feature names).
features_HGG <- as.data.frame(sapply(FeatureList, `[[`, 2))
features_HGG$Class <- rownames(FeatureList[[1]])
HGG_long <- gather(features_HGG, iteration, All_scores, V1:V500, factor_key=TRUE)
# Fix: y axis of geom_density() is a density, not a count.
ggplot(HGG_long, aes(x = All_scores, fill = Class, color = Class)) +
  geom_density(alpha = .2) +
  theme(legend.position = "top") +
  scale_color_brewer(palette = "Spectral") +
  scale_fill_brewer(palette = "Spectral") +
  labs(title = "Importance of features in HGG", x = "Importance", y = "Density")
# Column 3 of each importance table = Low Grade Glioma; rows are the
# MCP-counter features (so `Class` holds feature names).
features_LGG <- as.data.frame(sapply(FeatureList, `[[`, 3))
features_LGG$Class <- rownames(FeatureList[[1]])
LGG_long <- gather(features_LGG, iteration, All_scores, V1:V500, factor_key=TRUE)
# Fix: y axis of geom_density() is a density, not a count.
ggplot(LGG_long, aes(x = All_scores, fill = Class, color = Class)) +
  geom_density(alpha = .2) +
  theme(legend.position = "top") +
  scale_color_brewer(palette = "Spectral") +
  scale_fill_brewer(palette = "Spectral") +
  labs(title = "Importance of features in LGG", x = "Importance", y = "Density")
# Column 4 of each importance table = Malignant Peripheral Nerve Sheath Tumor;
# rows are the MCP-counter features (so `Class` holds feature names).
features_MPNST <- as.data.frame(sapply(FeatureList, `[[`, 4))
features_MPNST$Class <- rownames(FeatureList[[1]])
MPNST_long <- gather(features_MPNST, iteration, All_scores, V1:V500, factor_key=TRUE)
# Fix: y axis of geom_density() is a density, not a count.
ggplot(MPNST_long, aes(x = All_scores, fill = Class, color = Class)) +
  geom_density(alpha = .2) +
  theme(legend.position = "top") +
  scale_color_brewer(palette = "Spectral") +
  scale_fill_brewer(palette = "Spectral") +
  labs(title = "Importance of features in MPNST", x = "Importance", y = "Density")
# Column 5 of each importance table = Neurofibroma; rows are the
# MCP-counter features (so `Class` holds feature names).
features_NF <- as.data.frame(sapply(FeatureList, `[[`, 5))
features_NF$Class <- rownames(FeatureList[[1]])
NF_long <- gather(features_NF, iteration, All_scores, V1:V500, factor_key=TRUE)
# Fix: y axis of geom_density() is a density, not a count.
ggplot(NF_long, aes(x = All_scores, fill = Class, color = Class)) +
  geom_density(alpha = .2) +
  theme(legend.position = "top") +
  scale_color_brewer(palette = "Spectral") +
  scale_fill_brewer(palette = "Spectral") +
  labs(title = "Importance of features in NF", x = "Importance", y = "Density")
# Column 6 of each importance table = Plexiform Neurofibroma; rows are the
# MCP-counter features (so `Class` holds feature names).
features_pNF <- as.data.frame(sapply(FeatureList, `[[`, 6))
features_pNF$Class <- rownames(FeatureList[[1]])
pNF_long <- gather(features_pNF, iteration, All_scores, V1:V500, factor_key=TRUE)
# Fix: y axis of geom_density() is a density, not a count.
ggplot(pNF_long, aes(x = All_scores, fill = Class, color = Class)) +
  geom_density(alpha = .2) +
  theme(legend.position = "top") +
  scale_color_brewer(palette = "Spectral") +
  scale_fill_brewer(palette = "Spectral") +
  labs(title = "Importance of features in pNF", x = "Importance", y = "Density")
# Uncomment to persist the 500-iteration importance tables and F1 scores
#save(FeatureList, perf_new, file = "RF_mcpcounter.RData")
# Record package versions for reproducibility
sessionInfo()
## R version 3.6.0 (2019-04-26)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS Mojave 10.14.5
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRlapack.dylib
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] parallel stats4 stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] glue_1.3.1 gridExtra_2.3
## [3] AnnotationDbi_1.46.0 IRanges_2.18.1
## [5] S4Vectors_0.22.0 Biobase_2.44.0
## [7] BiocGenerics_0.30.0 pheatmap_1.0.12
## [9] AppliedPredictiveModeling_1.1-7 caret_6.0-84
## [11] lattice_0.20-38 e1071_1.7-2
## [13] randomForest_4.6-14 wesanderson_0.3.6
## [15] RColorBrewer_1.1-2 colorspace_1.4-1
## [17] DT_0.8 forcats_0.4.0
## [19] stringr_1.4.0 dplyr_0.8.3
## [21] purrr_0.3.2 readr_1.3.1
## [23] tidyr_0.8.3 tibble_2.1.3
## [25] ggplot2_3.2.1 tidyverse_1.2.1
## [27] BiocManager_1.30.4 synapserutils_0.1.6
## [29] synapser_0.6.58
##
## loaded via a namespace (and not attached):
## [1] nlme_3.1-141 bit64_0.9-7 lubridate_1.7.4
## [4] httr_1.4.1 tools_3.6.0 backports_1.1.4
## [7] R6_2.4.0 rpart_4.1-15 DBI_1.0.0
## [10] lazyeval_0.2.2 nnet_7.3-12 withr_2.1.2
## [13] tidyselect_0.2.5 bit_1.1-14 compiler_3.6.0
## [16] cli_1.1.0 rvest_0.3.4 xml2_1.2.2
## [19] labeling_0.3 scales_1.0.0 digest_0.6.20
## [22] rmarkdown_1.14 pkgconfig_2.0.2 htmltools_0.3.6
## [25] htmlwidgets_1.3 rlang_0.4.0 readxl_1.3.1
## [28] RSQLite_2.1.2 rstudioapi_0.10 generics_0.0.2
## [31] jsonlite_1.6 ModelMetrics_1.2.2 CORElearn_1.53.1
## [34] magrittr_1.5 Matrix_1.2-17 Rcpp_1.0.2
## [37] munsell_0.5.0 stringi_1.4.3 yaml_2.2.0
## [40] MASS_7.3-51.4 plyr_1.8.4 recipes_0.1.7
## [43] blob_1.2.0 grid_3.6.0 crayon_1.3.4
## [46] haven_2.1.1 splines_3.6.0 hms_0.5.0
## [49] zeallot_0.1.0 knitr_1.24 pillar_1.4.2
## [52] reshape2_1.4.3 codetools_0.2-16 PythonEmbedInR_0.3.37
## [55] evaluate_0.14 data.table_1.12.2 modelr_0.1.5
## [58] vctrs_0.2.0 foreach_1.4.7 cellranger_1.1.0
## [61] gtable_0.3.0 assertthat_0.2.1 pack_0.1-1
## [64] xfun_0.8 gower_0.2.1 prodlim_2019.10.13
## [67] broom_0.5.2 class_7.3-15 survival_2.44-1.1
## [70] timeDate_3043.102 iterators_1.0.12 memoise_1.1.0
## [73] ellipse_0.4.1 cluster_2.1.0 lava_1.6.6
## [76] ipred_0.9-9